last updated: 2023-11-14
Install Packages
As usual, make sure we have the right packages for this exercise
if (!require("pacman")) install.packages("pacman"); library(pacman)
## Loading required package: pacman
# let's load all of the files we were using and want to have again today
p_load("tidyverse", "knitr", "readr",
"pander", "BiocManager",
"dplyr", "stringr",
"statmod", # required dependency, need to load manually on some macOS versions.
"Glimma", # beautifies limma results
"purrr", # for working with lists (beautify column names)
"reactable") # for pretty tables.
# for today:
# p_load("WGCNA") # WGCNA is available on CRAN
if (!require("WGCNA")) BiocManager::install("WGCNA", force=TRUE); library(WGCNA)
## Loading required package: WGCNA
## Loading required package: dynamicTreeCut
## Loading required package: fastcluster
##
## Attaching package: 'fastcluster'
## The following object is masked from 'package:stats':
##
## hclust
##
##
## Attaching package: 'WGCNA'
## The following object is masked from 'package:stats':
##
## cor
#
# # We also need these Bioconductor packages today.
p_load("edgeR")
p_load("matrixStats")
p_load("org.Sc.sgd.db")
p_load("cowplot")
p_load("igraph")
# p_load("AnnotationDbi", "org.Sc.sgd.db", "ggVennDiagram")
# p_load("janitor")
# p_load("RColorBrewer")
# p_load("UpSetR")
# p_load("ComplexHeatmap")
# p_load("enrichplot")
# p_load("clusterProfiler")
# p_load("factoextra", "NbClust")
# p_load("ggpubr", "corrplot", "stringr", "tidyr")
# #NOTE: edgeR loads limma as a dependency
# library(ComplexHeatmap)
# library(RColorBrewer)
# library(limma)
# library(org.Sc.sgd.db)
#
# # choose number clusters
# library(factoextra)
# library(NbClust)
#
# # corr plots
# library(ggpubr)
# library(corrplot)
# library(stringr)
# library(tidyr)
Use limma to identify differential patterns of proteomic and transcriptomic changes in a time series heat shock experiment.
At the end of this exercise, you should be able to:
# for ease of use, set max number of digits after decimal
options(digits=3)
We are downloading the counts for the non-subsampled fastq files
from a Github repository using the code below. Just as in previous
exercises, assign the data to the variable counts. You can
change the file path if you have saved it to your computer in a
different location.
# proteome_HS <- read.delim("https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/data/heat_shock/proteomic_FC/HS_Proteomics_Data.txt",
# sep="\t",
# header=T,
# row.names=1)
# Load gene file used of raw counts
counts <- read.delim('https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/data/YPS606_TF_Rep1_4_ExpectedCounts.txt',
sep = "\t",
header = T,
row.names = 1
)
col_sel = names(counts) # Get all but first column name
mdata <- counts %>%
rownames_to_column("gene_id") %>%
tidyr::pivot_longer(
., # The dot is the the input data, magrittr tutorial
col = all_of(col_sel)
) %>%
separate_wider_delim(name, delim = "_", names = c("strain", "stress", "replicate"),cols_remove = FALSE)
(
p <- mdata %>%
ggplot(., aes(x = replicate, y = value)) + # x = treatment, y = RNA Seq count
geom_violin() + # violin plot, show distribution
geom_point(alpha = 0.2) + # scatter plot
theme_bw() +
theme(
axis.text.x = element_text(angle = 90) # Rotate treatment text
) +
labs(x = "Treatment Groups", y = "RNA Seq Counts") +
facet_grid(cols = vars(stress),rows = vars(strain), drop = TRUE, scales = "free_x") # Facet by hour
)
Let’s use edgeR to normalize.
# read into a DGElist
y <- edgeR::DGEList(counts,
group=as.factor(
# remove replicate from name
sub("_[^_]+$", "", colnames(counts))
),
genes = rownames(counts)
)
# filter low counts
keep <- rowSums(cpm(y) > 1) >= 4
y <- y[keep,]
# normalize with cpm
normalized_counts <- cpm(y,
log = TRUE)
str(normalized_counts)
## num [1:5756, 1:48] 4.59 5.07 10.95 0.18 8.03 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:5756] "YAL001C" "YAL002W" "YAL003W" "YAL004W" ...
## ..$ : chr [1:48] "YPS606_Mock_Rep1" "YPS606_Mock_Rep2" "YPS606_Mock_Rep3" "YPS606_Mock_Rep4" ...
# get th row
rv_wpn <- matrixStats::rowVars(normalized_counts)
summary(rv_wpn)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01 0.15 0.31 0.79 0.74 20.77
# get variance corresponding to 95th percentile
q95_wpn <- quantile( matrixStats::rowVars(normalized_counts), .95) # <= changed to 95 quantile
# get variance corresponding to 97.5th percentile
q975_wpn <- quantile( matrixStats::rowVars(normalized_counts), .975)
# to reduce dataset, only get genes with variance greater than above
expr_normalized <- normalized_counts[ rv_wpn > q975_wpn, ]
str(expr_normalized)
## num [1:144, 1:48] 9.196 7.176 6.818 0.763 -0.927 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:144] "YBL039C" "YBL042C" "YBL054W" "YBR072W" ...
## ..$ : chr [1:48] "YPS606_Mock_Rep1" "YPS606_Mock_Rep2" "YPS606_Mock_Rep3" "YPS606_Mock_Rep4" ...
Notice the number of genes retained is much fewer now.
What are the distributions of counts of the normalized cpm’s of the retained genes?
expr_normalized_df <- data.frame(expr_normalized) %>%
mutate(
Gene_id = row.names(expr_normalized)
) %>%
pivot_longer(-Gene_id)
expr_normalized_df %>% ggplot(., aes(x = name, y = value)) +
geom_violin() +
geom_point(alpha=0.5) +
theme_bw() +
theme(
axis.text.x = element_text( angle = 90)
) +
ylim(0, NA) +
labs(
title = "Normalized and 95 quantile Expression",
x = "treatment",
y = "normalized expression"
)
## Warning: Removed 746 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 746 rows containing missing values (`geom_point()`).
Now let’s transpose the data and prepare the dataset for WGCNA.
input_mat = t(expr_normalized)
# Look at first 5 rows and 10 columns
input_mat[1:5,1:10]
## YBL039C YBL042C YBL054W YBR072W YBR116C YBR117C YBR126C
## YPS606_Mock_Rep1 9.20 7.18 6.818 0.763 -0.927 0.856 6.22
## YPS606_Mock_Rep2 9.35 7.04 7.120 0.412 -0.761 0.998 6.19
## YPS606_Mock_Rep3 9.11 7.16 6.537 0.801 0.234 1.064 6.20
## YPS606_Mock_Rep4 9.33 7.08 6.860 0.847 -1.379 1.198 6.30
## YPS606_EtOH_Rep1 1.80 1.55 0.401 9.044 0.569 3.295 11.66
## YBR145W YBR147W YBR169C
## YPS606_Mock_Rep1 2.92 -1.00 1.73
## YPS606_Mock_Rep2 3.14 -1.18 1.77
## YPS606_Mock_Rep3 2.71 -1.46 2.10
## YPS606_Mock_Rep4 2.84 -1.90 1.89
## YPS606_EtOH_Rep1 10.48 2.77 7.80
We can see now that the rows = treatments and columns = gene probes. We’re ready to start WGCNA. A correlation network will be a complete network (all genes are connected to all other genes). Ergo we will need to pick a threshhold value (if correlation is below threshold, remove the edge). We assume the true biological network follows a scale-free structure (see papers by Albert Barabasi).
To do that, WGCNA will try a range of soft thresholds and create a diagnostic plot. This step will take several minutes so feel free to run and get coffee.
# library(WGCNA)
allowWGCNAThreads(nThreads = 4) # allow multi-threading (optional)
## Allowing multi-threading with up to 4 threads.
#> Allowing multi-threading with up to 4 threads.
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to = 20, by = 2))
# Call the network topology analysis function
sft = pickSoftThreshold(
input_mat, # <= Input data
#blockSize = 30,
powerVector = powers,
verbose = 5
)
## pickSoftThreshold: will use block size 144.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 144 of 144
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.4920 1.7600 6.60e-01 86.9 95.30 106.0
## 2 2 0.5340 0.7260 7.30e-01 65.4 74.90 89.4
## 3 3 0.5560 0.5630 6.18e-01 52.9 61.40 77.7
## 4 4 0.3590 0.4430 3.52e-01 44.4 51.10 68.6
## 5 5 0.3540 0.3580 3.26e-01 38.1 43.70 61.2
## 6 6 0.3710 0.2900 3.77e-01 33.3 37.90 55.0
## 7 7 0.4270 0.2790 4.84e-01 29.5 33.20 49.8
## 8 8 0.2720 0.2380 2.61e-01 26.4 29.20 45.3
## 9 9 0.2450 0.1960 2.74e-01 23.8 26.10 41.5
## 10 10 0.1280 0.1400 1.04e-01 21.6 24.00 38.5
## 11 12 0.0253 0.0391 4.58e-05 18.1 19.70 34.9
## 12 14 0.0871 -0.0752 -1.59e-01 15.5 16.00 32.0
## 13 16 0.3840 -0.1700 2.23e-01 13.4 13.20 29.6
## 14 18 0.6510 -0.2600 6.30e-01 11.8 11.10 27.6
## 15 20 0.5960 -0.3890 5.70e-01 10.4 9.62 25.9
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[, 1],
-sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
xlab = "Soft Threshold (power)",
ylab = "Scale Free Topology Model Fit, signed R^2",
main = paste("Scale independence")
)
text(sft$fitIndices[, 1],
-sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
labels = powers, cex = cex1, col = "red"
)
abline(h = 0.90, col = "red")
plot(sft$fitIndices[, 1],
sft$fitIndices[, 5],
xlab = "Soft Threshold (power)",
ylab = "Mean Connectivity",
type = "n",
main = paste("Mean connectivity")
)
text(sft$fitIndices[, 1],
sft$fitIndices[, 5],
labels = powers,
cex = cex1, col = "red")
Pick a soft threshold power near the curve of the plot, so here we could pick 7, 8 or 9. We’ll pick 9 but feel free to experiment with other powers to see how it affects your results. Now we can create the network using the blockwiseModules command. The blockwiseModule may take a while to run, since it is constructing the TOM (topological overlap matrix) and several other steps. While it runs, take a look at the blockwiseModule documentation (link to vignette[https://www.rdocumentation.org/packages/WGCNA/versions/1.69/topics/blockwiseModules]) for more information on the parameters. How might you change the parameters to get more or less modules?
picked_power = 7
temp_cor <- cor
cor <- WGCNA::cor # Force it to use WGCNA cor function (fix a namespace conflict issue)
netwk <- blockwiseModules(input_mat, # <= input here
# == Adjacency Function ==
power = picked_power, # <= power here
networkType = "signed",
# == Tree and Block Options ==
deepSplit = 2,
pamRespectsDendro = F,
# detectCutHeight = 0.75,
minModuleSize = 30,
maxBlockSize = 4000,
# == Module Adjustments ==
reassignThreshold = 0,
mergeCutHeight = 0.25,
# == TOM == Archive the run results in TOM file (saves time)
saveTOMs = T,
saveTOMFileBase = "ER",
# == Output Options
numericLabels = T,
verbose = 3)
## Calculating module eigengenes block-wise from all genes
## Flagging genes and samples with too many missing values...
## ..step 1
## ..Working on block 1 .
## TOM calculation: adjacency..
## ..will use 4 parallel threads.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 1 into file ER-block.1.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 3 genes from module 2 because their KME is too low.
## ..merging modules that are too close..
## mergeCloseModules: Merging modules whose distance is less than 0.25
## Calculating new MEs...
cor <- temp_cor
# Convert labels to colors for plotting
mergedColors = labels2colors(netwk$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(
netwk$dendrograms[[1]],
mergedColors[netwk$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05 )
module_df <- data.frame(
gene_id = names(netwk$colors),
colors = labels2colors(netwk$colors)
)
# Get Module Eigengenes per cluster
MEs0 <- moduleEigengenes(input_mat, mergedColors)$eigengenes
# Reorder modules so similar modules are next to each other
MEs0 <- orderMEs(MEs0)
module_order = names(MEs0) %>% gsub("ME","", .)
# Add treatment names
MEs0$treatment = row.names(MEs0)
# tidy & plot data
mME = MEs0 %>%
pivot_longer(-treatment) %>%
mutate(
name = gsub("ME", "", name),
name = factor(name, levels = module_order)
)
mME %>% ggplot(., aes(x=treatment, y=name, fill=value)) +
geom_tile() +
theme_bw() +
scale_fill_gradient2(
low = "blue",
high = "red",
mid = "white",
midpoint = 0,
limit = c(-1,1)) +
theme(axis.text.x = element_text(angle=90)) +
labs(title = "Module-trait Relationships", y = "Modules", fill="corr")
# pick out a few modules of interest here
modules_of_interest = c("turquoise", "blue", "brown", "grey")
# Pull out list of genes in that module
submod = module_df %>%
subset(colors %in% modules_of_interest)
row.names(module_df) = module_df$gene_id
# Get normalized expression for those genes
subexpr = expr_normalized[submod$gene_id,]
submod_df = data.frame(subexpr) %>%
mutate(
gene_id = row.names(.)
) %>%
pivot_longer(-gene_id) %>%
mutate(
module = module_df[gene_id,]$colors
)
submod_df %>% ggplot(., aes(x=name, y=value, group=gene_id)) +
geom_line(aes(color = module),
alpha = 0.2) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 90)
) +
facet_grid(rows = vars(module),
# scales = "free_y"
) +
labs(x = "treatment",
y = "normalized expression") +
scale_color_identity() +
scale_y_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 98 rows containing missing values (`geom_line()`).
The network file can be generated for Cytoscape or as an edge/vertices file.
genes_of_interest = module_df %>%
subset(colors %in% modules_of_interest)
expr_of_interest = expr_normalized[genes_of_interest$gene_id,]
expr_of_interest[1:5,1:5]
## YPS606_Mock_Rep1 YPS606_Mock_Rep2 YPS606_Mock_Rep3 YPS606_Mock_Rep4
## YBL039C 9.196 9.345 9.109 9.328
## YBL042C 7.176 7.039 7.164 7.077
## YBL054W 6.818 7.120 6.537 6.860
## YBR072W 0.763 0.412 0.801 0.847
## YBR116C -0.927 -0.761 0.234 -1.379
## YPS606_EtOH_Rep1
## YBL039C 1.804
## YBL042C 1.550
## YBL054W 0.401
## YBR072W 9.044
## YBR116C 0.569
# Only recalculate TOM for modules of interest (faster, altho there's some online discussion if this will be slightly off)
TOM = TOMsimilarityFromExpr(t(expr_of_interest),
power = picked_power)
## TOM calculation: adjacency..
## ..will use 4 parallel threads.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# Add gene names to row and columns
row.names(TOM) = row.names(expr_of_interest)
colnames(TOM) = row.names(expr_of_interest)
edge_list = data.frame(TOM) %>%
mutate(
gene1 = row.names(.)
) %>%
pivot_longer(-gene1) %>%
dplyr::rename(gene2 = name, correlation = value) %>%
unique() %>%
subset(!(gene1==gene2)) %>%
mutate(
module1 = module_df[gene1,]$colors,
module2 = module_df[gene2,]$colors
)
head(edge_list)
## # A tibble: 6 Ă— 5
## gene1 gene2 correlation module1 module2
## <chr> <chr> <dbl> <chr> <chr>
## 1 YBL039C YBL042C 0.695 blue blue
## 2 YBL039C YBL054W 0.690 blue blue
## 3 YBL039C YBR072W 0.565 blue turquoise
## 4 YBL039C YBR116C 0.000369 blue brown
## 5 YBL039C YBR117C 0.00224 blue brown
## 6 YBL039C YBR126C 0.243 blue turquoise
adjacency = adjacency(input_mat, power=12, type="signed")
adjacency[adjacency < 0] = 0
adjacency[adjacency > 1] = 1
TOM = TOMsimilarity(adjacency, TOMType="signed")
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# Add gene names to row and columns
row.names(TOM) = AnnotationDbi::select(org.Sc.sgd.db,keys=row.names(adjacency),columns="GENENAME") %>% mutate(label = case_when(
is.na(GENENAME) ~ ORF,
TRUE ~ GENENAME
)) %>% pull(label)
## 'select()' returned 1:1 mapping between keys and columns
colnames(TOM) = AnnotationDbi::select(org.Sc.sgd.db,keys=row.names(adjacency),columns="GENENAME") %>% mutate(label = case_when(
is.na(GENENAME) ~ ORF,
TRUE ~ GENENAME
)) %>% pull(label)
## 'select()' returned 1:1 mapping between keys and columns
adj <- TOM
adj[adj > 0.5] = 1
adj[adj != 1] = 0
network <- graph.adjacency(adj)
network <- simplify(network,) # removes self-loops
cor <- WGCNA::cor
results <- WGCNA::blockwiseModules(input_mat, power=12, TOMType="signed", networkType="signed")
cor <- stats::cor
V(network)$color <- results$colors
par(mar=c(0,0,0,0))
# remove unconnected nodes
network <- delete.vertices(network, degree(network)==0)
plot(network, layout=layout.fruchterman.reingold(network, niter=10000),type="o")
plot(network, layout=layout_with_mds(network), edge.arrow.size = 0.1)
FC_list <- read_delim("https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/data/DE_yeast_TF_stress.txt.gz",
delim = "\t", escape_double = FALSE,
name_repair = "universal",
show_col_types=FALSE,
trim_ws = TRUE) %>%
dplyr::select(ID, contains("logFC")) %>%
dplyr::select(ID, contains(".v.")) %>%
column_to_rownames("ID")
## New names:
## • `logFC: WT NaCl response` -> `logFC..WT.NaCl.response`
## • `FDR: WT NaCl response` -> `FDR..WT.NaCl.response`
## • `logFC: msn24 mutant NaCl response` -> `logFC..msn24.mutant.NaCl.response`
## • `FDR: msn24 mutant NaCl response` -> `FDR..msn24.mutant.NaCl.response`
## • `logFC: yap1 mutant NaCl response` -> `logFC..yap1.mutant.NaCl.response`
## • `FDR: yap1 mutant NaCl response` -> `FDR..yap1.mutant.NaCl.response`
## • `logFC: skn7 mutant NaCl response` -> `logFC..skn7.mutant.NaCl.response`
## • `FDR: skn7 mutant NaCl response` -> `FDR..skn7.mutant.NaCl.response`
## • `logFC: WT EtOH response` -> `logFC..WT.EtOH.response`
## • `FDR: WT EtOH response` -> `FDR..WT.EtOH.response`
## • `logFC: msn24 mutant EtOH response` -> `logFC...msn24.mutant.EtOH.response`
## • `FDR: msn24 mutan EtOH response` -> `FDR...msn24.mutan..EtOH.response`
## • `logFC: yap1 mutant EtOH response` -> `logFC...yap1.mutant.EtOH.response`
## • `FDR: yap1 mutant EtOH response` -> `FDR...yap1.mutant.EtOH.response`
## • `logFC: skn7 mutant EtOH response` -> `logFC...skn7.mutant.EtOH.response`
## • `FDR: skn7 mutant EtOH response` -> `FDR...skn7.mutant.EtOH.response`
## • `logFC: WT v msn24 mutant: NaCl response` ->
## `logFC..WT.v.msn24.mutant..NaCl.response`
## • `FDR: WT v msn24 mutant: NaCl response` ->
## `FDR..WT.v.msn24.mutant..NaCl.response`
## • `logFC: WT v yap1 mutant: NaCl response` ->
## `logFC..WT.v.yap1.mutant..NaCl.response`
## • `FDR: WT v yap1 mutant: NaCl response` ->
## `FDR..WT.v.yap1.mutant..NaCl.response`
## • `logFC: WT v skn7 mutant: NaCl response` ->
## `logFC..WT.v.skn7.mutant..NaCl.response`
## • `FDR: WT v skn7 mutant NaCl response` ->
## `FDR..WT.v.skn7.mutant.NaCl.response`
## • `logFC: WT v msn24 mutant: EtOH response` ->
## `logFC..WT.v.msn24.mutant..EtOH.response`
## • `FDR: WT v msn24 mutant: EtOH response` ->
## `FDR..WT.v.msn24.mutant..EtOH.response`
## • `logFC: WT v yap1 mutant: EtOH response` ->
## `logFC..WT.v.yap1.mutant..EtOH.response`
## • `FDR: WT v yap1 mutant: EtOH response` ->
## `FDR..WT.v.yap1.mutant..EtOH.response`
## • `logFC: WT v skn7 mutant: EtOH response` ->
## `logFC..WT.v.skn7.mutant..EtOH.response`
## • `FDR: WT v skn7 mutant: EtOH response` ->
## `FDR..WT.v.skn7.mutant..EtOH.response`
FC_list_t <- t(FC_list)
cv_wpn <- matrixStats::colVars(FC_list_t)
# get variance corresponding to 95th percentile
q95_wpn <- quantile( matrixStats::colVars(FC_list_t), .75) # <= changed to 95 quantile
# to reduce dataset, only get genes with variance greater than above
FC_list_t <- FC_list_t[, cv_wpn > q95_wpn ]
adjacency = adjacency(FC_list_t, power=12, type="signed")
adjacency[adjacency < 0] = 0
adjacency[adjacency > 1] = 1
TOM = TOMsimilarity(adjacency, TOMType="unsigned")
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# Add gene names to row and columns
row.names(TOM) = AnnotationDbi::select(org.Sc.sgd.db,keys=row.names(adjacency),columns="GENENAME") %>% mutate(label = case_when(
is.na(GENENAME) ~ ORF,
TRUE ~ GENENAME
)) %>% pull(label)
## 'select()' returned 1:1 mapping between keys and columns
colnames(TOM) = AnnotationDbi::select(org.Sc.sgd.db,keys=row.names(adjacency),columns="GENENAME") %>% mutate(label = case_when(
is.na(GENENAME) ~ ORF,
TRUE ~ GENENAME
)) %>% pull(label)
## 'select()' returned 1:1 mapping between keys and columns
adj <- TOM
adj[adj > 0.3] = 1
adj[adj != 1] = 0
network <- graph.adjacency(adj)
network <- simplify(network) # removes self-loops
cor <- WGCNA::cor
results <- WGCNA::blockwiseModules(FC_list_t, power=12, TOMType="unsigned", networkType="unsigned")
cor <- stats::cor
V(network)$color <- results$colors
par(mar=c(0,0,0,0))
# remove unconnected nodes
network <- delete.vertices(network, degree(network)==0)
# network <- delete.vertices(network, V(network)$color=="blue")
# network <- delete.vertices(network, V(network)$color=="yellow")
plot(network, layout=layout.fruchterman.reingold(network, niter=1000),
vertex.size=3,edge.arrow.size=0.1, vertex.label.cex = 0.05, vertex.label.color= "black", rescale=TRUE, ylim=c(-1,1),xlim=c(-1,1))
# plot(network, layout=layout_with_mds(network), vertex.label.cex = 0.5,
# edge.arrow.size = 0.1, vertex.size=7,rescale=FALSE)
# plot(network, layout=layout_on_grid(network),
# vertex.size=5,edge.arrow.size=0.1, vertex.label.cex = 0.5,rescale=TRUE)#, ylim=c(-5,-2),xlim=c(-8,5), asp = 1)
module_df <- data.frame(
gene_id = names(results$colors),
colors = labels2colors(results$colors)
)
# Convert labels to colors for plotting
mergedColors = labels2colors(results$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(
results$dendrograms[[1]],
mergedColors[results$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05 )
# pick out a few modules of interest here
modules_of_interest = c("midnightblue", "lightyellow", "cyan", "blue", "salmon", "turquoise", "royalblue", "purple", "lightcyan", "red", "lightgreen", "yellow", "grey60", "green", "brown", "tan", "pink", "greenyellow", "magenta", "black")
# Pull out list of genes in that module
submod = module_df %>%
subset(colors %in% modules_of_interest)
row.names(module_df) = module_df$gene_id
# Get normalized expression for those genes
subexpr = FC_list[submod$gene_id,]
submod_df = data.frame(subexpr) %>%
mutate(
gene_id = row.names(.)
) %>%
pivot_longer(-gene_id) %>%
mutate(
module = module_df[gene_id,]$colors
)
submod_df %>% ggplot(., aes(x=name, y=value, group=gene_id)) +
geom_line(aes(color = module),
alpha = 0.2) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 40)
) +
facet_grid(rows = vars(module),
# scales = "free_y"
) +
scale_color_identity() +
labs(x = "treatment",
y = "normalized expression")# +
# scale_y_log10()
# Get Module Eigengenes per cluster
MEs0 <- moduleEigengenes(FC_list_t, mergedColors)$eigengenes
# Reorder modules so similar modules are next to each other
MEs0 <- orderMEs(MEs0)
module_order = names(MEs0) %>% gsub("ME","", .)
# Add treatment names
MEs0$treatment = row.names(MEs0)
# tidy & plot data
mME = MEs0 %>%
pivot_longer(-treatment) %>%
mutate(
name = gsub("ME", "", name),
name = factor(name, levels = module_order)
)
mME %>% ggplot(., aes(x=treatment, y=name, fill=value)) +
geom_tile() +
theme_bw() +
scale_fill_gradient2(
low = "blue",
high = "red",
mid = "white",
midpoint = 0,
limit = c(-1,1)) +
theme(axis.text.x = element_text(angle=20)) +
labs(title = "Module-trait Relationships", y = "Modules", fill="corr")
GO enrichment on those modules
go_results_yellow <- clusterProfiler::enrichGO(
gene = submod_df %>% filter(module=="lightyellow") %>% pull(gene_id) %>% unique(),
OrgDb = "org.Sc.sgd.db",
keyType = "ORF",
ont = "ALL"
) |>
# let's add a 'richFactor' column that gives us the proportion of genes DE in the term
mutate(richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)))
# take a peak at the results
head(go_results_yellow)
## ONTOLOGY ID
## GO:0006091 BP GO:0006091
## GO:0015980 BP GO:0015980
## GO:0006979 BP GO:0006979
## GO:0034599 BP GO:0034599
## GO:0005975 BP GO:0005975
## GO:0062197 BP GO:0062197
## Description GeneRatio
## GO:0006091 generation of precursor metabolites and energy 36/289
## GO:0015980 energy derivation by oxidation of organic compounds 27/289
## GO:0006979 response to oxidative stress 22/289
## GO:0034599 cellular response to oxidative stress 20/289
## GO:0005975 carbohydrate metabolic process 31/289
## GO:0062197 cellular response to chemical stress 24/289
## BgRatio pvalue p.adjust qvalue
## GO:0006091 226/6424 1.526607e-11 1.581565e-08 1.446260e-08
## GO:0015980 164/6424 3.020572e-09 1.564656e-06 1.430797e-06
## GO:0006979 127/6424 3.626456e-08 1.252336e-05 1.145197e-05
## GO:0034599 111/6424 7.853587e-08 2.034079e-05 1.860060e-05
## GO:0005975 245/6424 1.238047e-07 2.565233e-05 2.345773e-05
## GO:0062197 161/6424 1.692218e-07 2.921896e-05 2.671922e-05
## geneID
## GO:0006091 YBR026C/YCL035C/YDL021W/YDR148C/YDR516C/YEL011W/YEL039C/YER054C/YER067W/YFR015C/YFR017C/YFR053C/YGL191W/YGR043C/YGR248W/YHR001W-A/YIL111W/YIL125W/YJL052W/YJL166W/YJR121W/YKL035W/YKL085W/YKL148C/YLR258W/YML070W/YML110C/YML120C/YMR081C/YMR105C/YMR145C/YMR315W/YOR178C/YOR347C/YPL172C/YPR160W
## GO:0015980 YBR026C/YDR148C/YEL011W/YEL039C/YER054C/YER067W/YFR015C/YFR017C/YGL191W/YHR001W-A/YIL111W/YIL125W/YJL166W/YJR121W/YKL035W/YKL085W/YKL148C/YLR258W/YML070W/YML110C/YML120C/YMR081C/YMR105C/YMR145C/YOR178C/YPL172C/YPR160W
## GO:0006979 YBL064C/YBR046C/YBR126C/YCL035C/YCR083W/YDL124W/YDR231C/YFL014W/YFR014C/YGR043C/YGR088W/YHR008C/YHR104W/YJR096W/YKL026C/YKL062W/YKL150W/YKR066C/YLR109W/YML028W/YMR037C/YMR250W
## GO:0034599 YBL064C/YBR046C/YBR126C/YCL035C/YCR083W/YDL124W/YFL014W/YFR014C/YGR043C/YHR008C/YHR104W/YJR096W/YKL026C/YKL062W/YKL150W/YKR066C/YLR109W/YML028W/YMR037C/YMR250W
## GO:0005975 YBR001C/YBR053C/YBR105C/YBR126C/YDL021W/YDR001C/YDR516C/YEL011W/YER054C/YFR015C/YFR017C/YFR053C/YGR043C/YGR143W/YGR248W/YHR104W/YJL052W/YJR096W/YKL035W/YKL085W/YLR258W/YML028W/YML070W/YML100W/YMR105C/YMR196W/YMR261C/YMR305C/YOR178C/YOR347C/YPR160W
## GO:0062197 YAL005C/YAL028W/YBL064C/YBR046C/YBR072W/YBR126C/YCL035C/YCR083W/YDL124W/YFL014W/YFR014C/YGR043C/YHR008C/YHR104W/YJR096W/YKL026C/YKL062W/YKL150W/YKR066C/YLR109W/YML028W/YMR037C/YMR250W/YNL234W
## Count richFactor
## GO:0006091 36 0.1592920
## GO:0015980 27 0.1646341
## GO:0006979 22 0.1732283
## GO:0034599 20 0.1801802
## GO:0005975 31 0.1265306
## GO:0062197 24 0.1490683
ggplot(go_results_yellow,
showCategory = 15,
aes(richFactor, fct_reorder(Description, richFactor))) +
geom_segment(aes(xend = 0, yend = Description)) +
geom_point(aes(color = p.adjust, size = Count)) +
scale_color_gradientn(
colours = c("#f7ca64", "#46bac2", "#7e62a3"),
trans = "log10",
guide = guide_colorbar(reverse = TRUE, order = 1)
) +
scale_size_continuous(range = c(2, 10)) +
xlab("Rich Factor") +
ylab(NULL) +
ggtitle("Enriched GO Categories") +
theme_bw()
# create list of genes in each one
ID_network_list <- submod_df %>%
# data.frame() %>%
# tibble::rownames_to_column("ORF") %>%
# tidyr::pivot_longer(-ORF,names_to="Time", values_to = "DE_direction") %>%
# mutate(Time = readr::parse_number(Time),
# DE_name = case_when(DE_direction == 1 ~ "up",
# DE_direction == -1 ~ "down",
# TRUE~NA),
# # create grouping variable
# group=paste0("DE_",DE_name,"_0",Time, "min")) %>%
# # modify names to pad leading 0's to fix time order
# mutate(group = str_replace(group,"_0120", "_120")) %>%
# mutate(group = str_replace(group,"_0240", "_240")) %>%
# mutate(entrezID=AnnotationDbi::select(org.Sc.sgd.db,keys=ORF,columns="ENTREZID",
# multiVals = "first")$ENTREZID) %>%
group_by(module) %>%
# only get genes that are DE
# filter(DE_direction != 0) %>%
group_split() %>% # split into many lists
purrr::set_names(purrr::map_chr(., ~.x$module[1])) %>% # assign names
# get just the ORF names
map(., ~ .x |> pull(all_of("gene_id")))
GO_enrich_network_all <- clusterProfiler::compareCluster(
geneCluster = ID_network_list,
ont = "BP",
OrgDb = org.Sc.sgd.db,
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
readable = FALSE,
fun = 'enrichGO',
keyType = 'ORF'
) |>
# let's add a 'richFactor' column that gives us the proportion of genes DE in the term
mutate(richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)))
clusterProfiler::dotplot(GO_enrich_network_all,
showCategory=10
# if we choose "ALL", we can show them all with facet.
#facet="ONTOLOGY",
) + scale_x_discrete(guide = guide_axis(angle = 60))
# get rid of sticks to save space
GO_enrich_network_all %>%
# GO_enrich_proteome_all %>%
as.data.frame() %>%
group_by(Cluster) %>%
slice_min(order_by=p.adjust, n = 8) %>%
# separate_wider_delim(Cluster, delim = "_", names = c("junk","DE", "time")) %>%
# mutate(Time = readr::parse_number(time)) %>%
ggplot(.,
aes(Cluster, fct_reorder(Description, richFactor))) +
geom_point(aes(color = p.adjust, size = Count)) +
scale_color_gradientn(
colours = c("#f7ca64", "#46bac2", "#7e62a3"),
trans = "log10",
guide = guide_colorbar(reverse = TRUE, order = 1)
) +
scale_size_continuous(range = c(2, 12)) +
scale_y_discrete(labels = function(x) str_wrap(x, width = 25)) +
xlab("Time (minutes)") +
ylab(NULL) +
ggtitle("Enriched GO Categories") +
# facet_wrap(~DE, scales = "free_y") +
theme_bw() +
theme(text = element_text(size=14))
I want to use just the counts, and build a network for each different knockout+stress combination? Could group either by mutant, stress, or both. It feels like both makes most sense.
# DE_genes
DE_genes <- read_delim("https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/data/DE_yeast_TF_stress.txt.gz",
delim = "\t", escape_double = FALSE,
name_repair = "universal",
show_col_types=FALSE,
trim_ws = TRUE) %>%
dplyr::select(ID, contains("FDR")) %>%
dplyr::select(ID, contains(".v.")) %>%
pivot_longer(-ID, names_to = "contrast", values_to = "FDR") %>%
filter(FDR < 0.01) %>%
pull(ID) %>%
unique()
## New names:
## • `logFC: WT NaCl response` -> `logFC..WT.NaCl.response`
## • `FDR: WT NaCl response` -> `FDR..WT.NaCl.response`
## • `logFC: msn24 mutant NaCl response` -> `logFC..msn24.mutant.NaCl.response`
## • `FDR: msn24 mutant NaCl response` -> `FDR..msn24.mutant.NaCl.response`
## • `logFC: yap1 mutant NaCl response` -> `logFC..yap1.mutant.NaCl.response`
## • `FDR: yap1 mutant NaCl response` -> `FDR..yap1.mutant.NaCl.response`
## • `logFC: skn7 mutant NaCl response` -> `logFC..skn7.mutant.NaCl.response`
## • `FDR: skn7 mutant NaCl response` -> `FDR..skn7.mutant.NaCl.response`
## • `logFC: WT EtOH response` -> `logFC..WT.EtOH.response`
## • `FDR: WT EtOH response` -> `FDR..WT.EtOH.response`
## • `logFC: msn24 mutant EtOH response` -> `logFC...msn24.mutant.EtOH.response`
## • `FDR: msn24 mutan EtOH response` -> `FDR...msn24.mutan..EtOH.response`
## • `logFC: yap1 mutant EtOH response` -> `logFC...yap1.mutant.EtOH.response`
## • `FDR: yap1 mutant EtOH response` -> `FDR...yap1.mutant.EtOH.response`
## • `logFC: skn7 mutant EtOH response` -> `logFC...skn7.mutant.EtOH.response`
## • `FDR: skn7 mutant EtOH response` -> `FDR...skn7.mutant.EtOH.response`
## • `logFC: WT v msn24 mutant: NaCl response` ->
## `logFC..WT.v.msn24.mutant..NaCl.response`
## • `FDR: WT v msn24 mutant: NaCl response` ->
## `FDR..WT.v.msn24.mutant..NaCl.response`
## • `logFC: WT v yap1 mutant: NaCl response` ->
## `logFC..WT.v.yap1.mutant..NaCl.response`
## • `FDR: WT v yap1 mutant: NaCl response` ->
## `FDR..WT.v.yap1.mutant..NaCl.response`
## • `logFC: WT v skn7 mutant: NaCl response` ->
## `logFC..WT.v.skn7.mutant..NaCl.response`
## • `FDR: WT v skn7 mutant NaCl response` ->
## `FDR..WT.v.skn7.mutant.NaCl.response`
## • `logFC: WT v msn24 mutant: EtOH response` ->
## `logFC..WT.v.msn24.mutant..EtOH.response`
## • `FDR: WT v msn24 mutant: EtOH response` ->
## `FDR..WT.v.msn24.mutant..EtOH.response`
## • `logFC: WT v yap1 mutant: EtOH response` ->
## `logFC..WT.v.yap1.mutant..EtOH.response`
## • `FDR: WT v yap1 mutant: EtOH response` ->
## `FDR..WT.v.yap1.mutant..EtOH.response`
## • `logFC: WT v skn7 mutant: EtOH response` ->
## `logFC..WT.v.skn7.mutant..EtOH.response`
## • `FDR: WT v skn7 mutant: EtOH response` ->
## `FDR..WT.v.skn7.mutant..EtOH.response`
counts_DE <- read.delim('https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/data/YPS606_TF_Rep1_4_ExpectedCounts.txt',
sep = "\t",
header = T,
row.names = 1
) %>%
rownames_to_column("ORF") %>%
dplyr::filter(ORF %in% DE_genes) %>%
column_to_rownames("ORF")
# read into a DGElist
y_DE <- edgeR::DGEList(counts_DE,
group=as.factor(
# remove replicate from name
sub("_[^_]+$", "", colnames(counts_DE))
),
genes = rownames(counts_DE)
)
# filter low counts
keep_DE <- rowSums(cpm(y_DE) > 1) >= 4
y_DE <- y_DE[keep_DE,]
# normalize with cpm
normalized_counts_DE <- cpm(y_DE,
log = FALSE)
expr_normalized_DE <- normalized_counts_DE#[ rv_wpn > q975_wpn, ]
input_mat = t(expr_normalized_DE)
picked_power = 7
temp_cor <- cor
cor <- WGCNA::cor # Force it to use WGCNA cor function (fix a namespace conflict issue)
netwk <- blockwiseModules(input_mat, # <= input here
# == Adjacency Function ==
power = picked_power, # <= power here
networkType = "signed",
# == Tree and Block Options ==
deepSplit = 2,
pamRespectsDendro = F,
# detectCutHeight = 0.75,
minModuleSize = 30,
maxBlockSize = 4000,
# == Module Adjustments ==
reassignThreshold = 0,
mergeCutHeight = 0.25,
# == TOM == Archive the run results in TOM file (saves time)
saveTOMs = T,
saveTOMFileBase = "ER",
# == Output Options
numericLabels = T,
verbose = 3)
## Calculating module eigengenes block-wise from all genes
## Flagging genes and samples with too many missing values...
## ..step 1
## ..Working on block 1 .
## TOM calculation: adjacency..
## ..will use 4 parallel threads.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 1 into file ER-block.1.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 20 genes from module 1 because their KME is too low.
## ..removing 5 genes from module 2 because their KME is too low.
## ..removing 6 genes from module 3 because their KME is too low.
## ..merging modules that are too close..
## mergeCloseModules: Merging modules whose distance is less than 0.25
## Calculating new MEs...
cor <- temp_cor
# Convert labels to colors for plotting
mergedColors = labels2colors(netwk$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(
netwk$dendrograms[[1]],
mergedColors[netwk$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05 )
module_df <- data.frame(
gene_id = names(netwk$colors),
colors = labels2colors(netwk$colors)
)
# Get Module Eigengenes per cluster
MEs0 <- moduleEigengenes(input_mat, mergedColors)$eigengenes
# Reorder modules so similar modules are next to each other
MEs0 <- orderMEs(MEs0)
module_order = names(MEs0) %>% gsub("ME","", .)
# Add treatment names
MEs0$treatment = row.names(MEs0)
# tidy & plot data
mME = MEs0 %>%
pivot_longer(-treatment) %>%
mutate(
name = gsub("ME", "", name),
name = factor(name, levels = module_order)
)
mME %>% ggplot(., aes(x=treatment, y=name, fill=value)) +
geom_tile() +
theme_bw() +
scale_fill_gradient2(
low = "blue",
high = "red",
mid = "white",
midpoint = 0,
limit = c(-1,1)) +
theme(axis.text.x = element_text(angle=90)) +
labs(title = "Module-trait Relationships", y = "Modules", fill="corr")
# pick out a few modules of interest here
modules_of_interest = c("turquoise", "blue", "brown", "grey", "yellow")
# Pull out list of genes in that module
submod = module_df #%>%
# subset(colors %in% modules_of_interest)
row.names(module_df) = module_df$gene_id
# Get normalized expression for those genes
subexpr = expr_normalized_DE[submod$gene_id,]
submod_df = data.frame(subexpr) %>%
mutate(
gene_id = row.names(.)
) %>%
pivot_longer(-gene_id) %>%
mutate(
module = module_df[gene_id,]$colors
)
submod_df %>% ggplot(., aes(x=name, y=value, group=gene_id)) +
geom_line(aes(color = module),
alpha = 0.2) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 90)
) +
facet_grid(rows = vars(module),
# scales = "free_y"
) +
labs(x = "treatment",
y = "normalized expression") +
scale_color_identity() +
scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
The network file can be generated for Cytoscape or as an edge/vertices file.
genes_of_interest = module_df #%>%
#subset(colors %in% modules_of_interest)
expr_of_interest = expr_normalized_DE#[genes_of_interest$gene_id,]
expr_of_interest[1:5,1:5]
## YPS606_Mock_Rep1 YPS606_Mock_Rep2 YPS606_Mock_Rep3 YPS606_Mock_Rep4
## YAL003W 3826.774552 4339.529451 3615.74469 4268.77586
## YAL005C 505.201128 472.097082 680.92864 599.56587
## YAL007C 301.057121 302.063832 259.94362 267.73414
## YAL008W 9.419511 9.703168 10.51713 10.85248
## YAL012W 2563.701428 2279.260052 2429.33302 2347.98552
## YPS606_EtOH_Rep1
## YAL003W 2705.18447
## YAL005C 25551.22027
## YAL007C 690.08596
## YAL008W 56.01908
## YAL012W 1632.89141
# Only recalculate TOM for modules of interest (faster, altho there's some online discussion if this will be slightly off)
TOM = TOMsimilarityFromExpr(t(expr_of_interest),
power = picked_power)
## TOM calculation: adjacency..
## ..will use 4 parallel threads.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# Add gene names to row and columns
row.names(TOM) = row.names(expr_of_interest)
colnames(TOM) = row.names(expr_of_interest)
edge_list = data.frame(TOM) %>%
mutate(
gene1 = row.names(.)
) %>%
pivot_longer(-gene1) %>%
dplyr::rename(gene2 = name, correlation = value) %>%
unique() %>%
subset(!(gene1==gene2)) %>%
mutate(
module1 = module_df[gene1,]$colors,
module2 = module_df[gene2,]$colors
)
head(edge_list)
## # A tibble: 6 Ă— 5
## gene1 gene2 correlation module1 module2
## <chr> <chr> <dbl> <chr> <chr>
## 1 YAL003W YAL005C 0.0166 blue turquoise
## 2 YAL003W YAL007C 0.00144 blue turquoise
## 3 YAL003W YAL008W 0.0479 blue turquoise
## 4 YAL003W YAL012W 0.00485 blue blue
## 5 YAL003W YAL017W 0.0346 blue turquoise
## 6 YAL003W YAL023C 0.0236 blue turquoise
What does the distribution of correlations look like?
edge_list %>%
ggplot(aes(x=correlation)) +
geom_histogram(bins=60) +
theme_bw() +
facet_grid(rows=vars(module1), cols = vars(module2))
What is the correct power for this analysis?
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to = 20, by = 2))
# Call the network topology analysis function
sft = pickSoftThreshold(
input_mat, # <= Input data
#blockSize = 30,
powerVector = powers,
verbose = 5
)
## pickSoftThreshold: will use block size 1972.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 1972 of 1972
## Warning: executing %dopar% sequentially: no parallel backend registered
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 9.35e-01 2.470000 0.955 947.0 1000.0 1250
## 2 2 8.30e-01 0.887000 0.857 598.0 626.0 946
## 3 3 5.10e-01 0.316000 0.489 423.0 427.0 767
## 4 4 7.30e-06 0.000669 -0.188 319.0 304.0 645
## 5 5 5.31e-01 -0.208000 0.398 251.0 225.0 556
## 6 6 7.82e-01 -0.336000 0.720 203.0 170.0 489
## 7 7 9.00e-01 -0.447000 0.876 168.0 133.0 438
## 8 8 9.56e-01 -0.519000 0.944 142.0 106.0 396
## 9 9 9.66e-01 -0.598000 0.961 121.0 85.8 362
## 10 10 9.62e-01 -0.657000 0.955 105.0 70.1 333
## 11 12 9.86e-01 -0.726000 0.983 81.4 48.5 287
## 12 14 9.68e-01 -0.788000 0.961 65.0 34.9 252
## 13 16 9.74e-01 -0.838000 0.970 53.1 25.6 224
## 14 18 9.79e-01 -0.876000 0.979 44.3 19.3 202
## 15 20 9.71e-01 -0.899000 0.968 37.6 15.0 183
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[, 1],
-sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
xlab = "Soft Threshold (power)",
ylab = "Scale Free Topology Model Fit, signed R^2",
main = paste("Scale independence")
)
text(sft$fitIndices[, 1],
-sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
labels = powers, cex = cex1, col = "red"
)
abline(h = 0.90, col = "red")
plot(sft$fitIndices[, 1],
sft$fitIndices[, 5],
xlab = "Soft Threshold (power)",
ylab = "Mean Connectivity",
type = "n",
main = paste("Mean connectivity")
)
text(sft$fitIndices[, 1],
sft$fitIndices[, 5],
labels = powers,
cex = cex1, col = "red")
It looks like power=7 or 8 is reasonable. Let’s try with 7.
adjacency = adjacency(input_mat, power=7, type="signed")
adjacency[adjacency < 0] = 0
adjacency[adjacency > 1] = 1
TOM = TOMsimilarity(adjacency, TOMType="signed")
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# Add gene names to row and columns
row.names(TOM) = AnnotationDbi::select(org.Sc.sgd.db,keys=row.names(adjacency),columns="GENENAME") %>% mutate(label = case_when(
is.na(GENENAME) ~ ORF,
TRUE ~ GENENAME
)) %>% pull(label)
## 'select()' returned 1:1 mapping between keys and columns
colnames(TOM) = AnnotationDbi::select(org.Sc.sgd.db,keys=row.names(adjacency),columns="GENENAME") %>% mutate(label = case_when(
is.na(GENENAME) ~ ORF,
TRUE ~ GENENAME
)) %>% pull(label)
## 'select()' returned 1:1 mapping between keys and columns
adj <- TOM
adj[adj > 0.3] = 1
adj[adj != 1] = 0
network <- graph.adjacency(adj)
network <- simplify(network) # removes self-loops
cor <- WGCNA::cor
results <- WGCNA::blockwiseModules(input_mat, power=7, TOMType="signed", networkType="signed")
cor <- stats::cor
V(network)$color <- results$colors
par(mar=c(0,0,0,0))
# remove unconnected nodes
network <- delete.vertices(network, degree(network)==0)
# plot(network, layout=layout.fruchterman.reingold(network, niter=10000),type="o")
# plot(network, layout=layout_with_mds(network), edge.arrow.size = 0.1)
plot(network, layout=layout.fruchterman.reingold(network, niter=1000),
vertex.size=3,edge.arrow.size=0.1,
vertex.label.cex = 0.01, # increase this value to see gene names
vertex.label.color= "black", rescale=TRUE, ylim=c(-1,1),xlim=c(-1,1))
# Convert labels to colors for plotting
mergedColors = labels2colors(results$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(
results$dendrograms[[1]],
mergedColors[results$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05 )
# Get Module Eigengenes per cluster
MEs0 <- moduleEigengenes(input_mat, mergedColors)$eigengenes
# Reorder modules so similar modules are next to each other
MEs0 <- orderMEs(MEs0)
module_order = names(MEs0) %>% gsub("ME","", .)
# Add treatment names
MEs0$treatment = row.names(MEs0)
# tidy & plot data
mME = MEs0 %>%
pivot_longer(-treatment) %>%
mutate(
name = gsub("ME", "", name),
name = factor(name, levels = module_order)
)
mME %>%
ggplot(., aes(x = treatment, y = name, fill = value)) +
geom_tile() +
theme_bw() +
scale_fill_gradient2(
low = "blue",
high = "red",
mid = "white",
midpoint = 0,
limit = c(-1, 1)
) +
theme(axis.text.x = element_text(
angle = 90,
hjust = 1,
vjust = 1
)) +
labs(title = "Module-trait Relationships", y = "Modules", fill = "corr")
(
test_plot <- expr_normalized_DE %>%
data.frame() %>%
mutate(
gene_id = row.names(.)
) %>%
pivot_longer(-gene_id) %>%
mutate(
module = results$colors[gene_id]
) %>%
separate_wider_delim(name, delim = "_", names = c("strain", "stress", "replicate"),cols_remove = FALSE) %>%
ggplot(., aes(x=name, y=value+1, group=gene_id)) +
geom_line(aes(color = "grey"), size=1,
alpha = 0.2) +
geom_line(aes(color = module),
alpha = 0.2) +
# geom_boxplot(width=0.5,
# aes(group=name, color = module),
# outlier.shape = NA,
# alpha=0.9) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90)) +
facet_grid(rows = vars(module),
cols= vars(strain),
scales = "free"
) +
scale_x_discrete(position = "top") +
labs(x = "treatment",
y = "normalized expression") +
# use color variable as the color
scale_color_identity() +
scale_y_log10()
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# create list of genes in each one
ID_network_list <- expr_normalized_DE %>%
data.frame() %>%
mutate(
gene_id = row.names(.)
) %>%
pivot_longer(-gene_id) %>%
mutate(
module = results$colors[gene_id]
) %>%
group_by(module) %>%
# only get genes that are DE
# filter(DE_direction != 0) %>%
group_split() %>% # split into many lists
purrr::set_names(purrr::map_chr(., ~.x$module[1])) %>% # assign names
# get just the ORF names
map(., ~ .x |> pull(all_of("gene_id"))) %>%
# remove duplicates
map(., ~ unique(.))
GO_enrich_network_all <- clusterProfiler::compareCluster(
geneCluster = ID_network_list,
ont = "BP",
OrgDb = org.Sc.sgd.db,
pAdjustMethod = "BH",
pvalueCutoff = 0.5,
qvalueCutoff = 0.5,
readable = FALSE,
fun = 'enrichGO',
keyType = 'ORF'
) |>
# let's add a 'richFactor' column that gives us the proportion of genes DE in the term
mutate(richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)))
# get rid of sticks to save space
GO_enrich_network_all %>%
as.data.frame() %>%
group_by(Cluster) %>%
slice_min(order_by=pvalue, n = 5) %>%
ggplot(.,
aes(Cluster, fct_reorder(Description, richFactor))) +
geom_point(aes(color = p.adjust, size = Count)) +
scale_color_gradientn(
colours = c("#f7ca64", "#46bac2", "#7e62a3"),
trans = "log10",
guide = guide_colorbar(reverse = TRUE, order = 1)
) +
scale_size_continuous(range = c(2, 12)) +
scale_x_discrete(position = "top") +
scale_y_discrete(labels = function(x) str_wrap(x, width = 25)) +
xlab("Module") +
ylab(NULL) +
ggtitle("Enriched GO Categories by Module") +
# facet_wrap(~DE, scales = "free_y") +
theme_bw() +
theme(text = element_text(size=14),
axis.text.x = element_text(angle = 90,hjust=0) )
# here are the data to add to cowplot:
GO_summaries <- GO_enrich_network_all %>%
as.data.frame() %>%
group_by(Cluster) %>%
slice_min(order_by=p.adjust, n = 5) %>%
slice_head(n = 5) %>%
mutate(Description = case_when(
p.adjust < 0.05 ~ paste0(Description," (",Count,")"),
TRUE ~ " "
)) %>%
summarize(description = str_c(Description, collapse = "\n")) %>%
pull(description)
# draw plot with GO terms
cowplot::ggdraw(test_plot +
theme(plot.margin = margin(t = 0.5, r = 5, b = 0.5, l = 0.1, unit = "cm"))) +
cowplot::draw_text(GO_summaries,
x = 0.76, # Adjust this value for the desired horizontal position
y = rev(seq(0.05, 0.82, length.out = length(GO_summaries))), # Adjust this sequence for the desired vertical positions
size = 6,
hjust = 0, # Align to the left
vjust = 0.5)
Question 1: Which time point of gene (mRNA) expression showed the highest correlation with changes in protein abundance?
Question 2: Does the pattern of mRNA and protein levels match your expectatoins?
Question 3: How is the intersection between differentially expressed gene sets calculated, and why is it relevant?
Be sure to knit this file into a pdf or html file once you’re finished.
System information for reproducibility:
pander::pander(sessionInfo())
R version 4.3.1 (2023-06-16)
Platform: aarch64-apple-darwin20 (64-bit)
locale: en_US.UTF-8||en_US.UTF-8||en_US.UTF-8||C||en_US.UTF-8||en_US.UTF-8
attached base packages: stats4, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: igraph(v.1.5.1), cowplot(v.1.1.1), org.Sc.sgd.db(v.3.18.0), AnnotationDbi(v.1.64.1), IRanges(v.2.36.0), S4Vectors(v.0.40.1), Biobase(v.2.62.0), BiocGenerics(v.0.48.1), matrixStats(v.1.0.0), edgeR(v.4.0.1), limma(v.3.58.1), WGCNA(v.1.72-1), fastcluster(v.1.2.3), dynamicTreeCut(v.1.63-1), reactable(v.0.4.4), Glimma(v.2.12.0), statmod(v.1.5.0), BiocManager(v.1.30.22), pander(v.0.6.5), knitr(v.1.45), lubridate(v.1.9.3), forcats(v.1.0.0), stringr(v.1.5.0), dplyr(v.1.1.3), purrr(v.1.0.2), readr(v.2.1.4), tidyr(v.1.3.0), tibble(v.3.2.1), ggplot2(v.3.4.4), tidyverse(v.2.0.0) and pacman(v.0.5.1)
loaded via a namespace (and not attached): splines(v.4.3.1), later(v.1.3.1), ggplotify(v.0.1.2), bitops(v.1.0-7), filelock(v.1.0.2), polyclip(v.1.10-6), preprocessCore(v.1.64.0), rpart(v.4.1.21), lifecycle(v.1.0.3), doParallel(v.1.0.17), lattice(v.0.22-5), vroom(v.1.6.4), MASS(v.7.3-60), backports(v.1.4.1), magrittr(v.2.0.3), Hmisc(v.5.1-1), sass(v.0.4.7), rmarkdown(v.2.25), jquerylib(v.0.1.4), yaml(v.2.3.7), httpuv(v.1.6.12), RColorBrewer(v.1.1-3), DBI(v.1.1.3), abind(v.1.4-5), zlibbioc(v.1.48.0), GenomicRanges(v.1.54.1), ggraph(v.2.1.0), RCurl(v.1.98-1.13), yulab.utils(v.0.1.0), nnet(v.7.3-19), tweenr(v.2.0.2), rappdirs(v.0.3.3), GenomeInfoDbData(v.1.2.11), enrichplot(v.1.22.0), ggrepel(v.0.9.4), tidytree(v.0.4.5), codetools(v.0.2-19), DelayedArray(v.0.28.0), ggforce(v.0.4.1), DOSE(v.3.28.0), tidyselect(v.1.2.0), aplot(v.0.2.2), farver(v.2.1.1), viridis(v.0.6.4), BiocFileCache(v.2.10.1), base64enc(v.0.1-3), jsonlite(v.1.8.7), ellipsis(v.0.3.2), tidygraph(v.1.2.3), Formula(v.1.2-5), survival(v.3.5-7), iterators(v.1.0.14), foreach(v.1.5.2), tools(v.4.3.1), treeio(v.1.26.0), HPO.db(v.0.99.2), Rcpp(v.1.0.11), glue(v.1.6.2), gridExtra(v.2.3), SparseArray(v.1.2.2), xfun(v.0.41), DESeq2(v.1.42.0), qvalue(v.2.34.0), MatrixGenerics(v.1.14.0), GenomeInfoDb(v.1.38.0), withr(v.2.5.2), fastmap(v.1.1.1), fansi(v.1.0.5), digest(v.0.6.33), gridGraphics(v.0.5-1), timechange(v.0.2.0), R6(v.2.5.1), mime(v.0.12), colorspace(v.2.1-0), GO.db(v.3.18.0), RSQLite(v.2.3.2), utf8(v.1.2.4), generics(v.0.1.3), data.table(v.1.14.8), graphlayouts(v.1.0.1), httr(v.1.4.7), htmlwidgets(v.1.6.2), S4Arrays(v.1.2.0), scatterpie(v.0.2.1), pkgconfig(v.2.0.3), gtable(v.0.3.4), blob(v.1.2.4), impute(v.1.76.0), XVector(v.0.42.0), shadowtext(v.0.1.2), clusterProfiler(v.4.10.0), htmltools(v.0.5.7), fgsea(v.1.28.0), scales(v.1.2.1), png(v.0.1-8), ggfun(v.0.1.3), rstudioapi(v.0.15.0), tzdb(v.0.4.0), reshape2(v.1.4.4), nlme(v.3.1-163), checkmate(v.2.3.0), curl(v.5.1.0), cachem(v.1.0.8), BiocVersion(v.3.18.0), parallel(v.4.3.1), HDO.db(v.0.99.1), foreign(v.0.8-85), pillar(v.1.9.0), grid(v.4.3.1), vctrs(v.0.6.4), promises(v.1.2.1), dbplyr(v.2.4.0), xtable(v.1.8-4), cluster(v.2.1.4), htmlTable(v.2.4.2), evaluate(v.0.23), cli(v.3.6.1), locfit(v.1.5-9.8), compiler(v.4.3.1), rlang(v.1.1.1), crayon(v.1.5.2), labeling(v.0.4.3), fs(v.1.6.3), plyr(v.1.8.9), stringi(v.1.7.12), viridisLite(v.0.4.2), BiocParallel(v.1.36.0), MPO.db(v.0.99.7), munsell(v.0.5.0), Biostrings(v.2.70.1), lazyeval(v.0.2.2), GOSemSim(v.2.28.0), Matrix(v.1.6-1.1), patchwork(v.1.1.3), hms(v.1.1.3), bit64(v.4.0.5), KEGGREST(v.1.42.0), shiny(v.1.7.5.1), SummarizedExperiment(v.1.32.0), interactiveDisplayBase(v.1.40.0), highr(v.0.10), AnnotationHub(v.3.10.0), memoise(v.2.0.1), bslib(v.0.5.1), ggtree(v.3.10.0), fastmatch(v.1.1-4), bit(v.4.0.5), gson(v.0.1.0) and ape(v.5.7-1)